# Validation networks (House caucuses, Senate co-sponsorships, Mexico boards): 
#   R code for generating tables and graphics for
#   "Scraping public co-occurrences for statistical network analysis 
#    of political elites"
# 
# 
# Created by Paasha Mahdavi
# This version: 9 July 2017
#
#-----------------------------------------------------------------------------#
#    Loading packages and data                                                #
#-----------------------------------------------------------------------------#

# Start from an empty workspace.
# NOTE(review): this removes every (non-hidden) object from the environment
# the script is evaluated in; prefer running the script in a fresh R session.
rm(list = ls())

# Install required packages
# Only packages that are not yet installed are passed to install.packages().
# Names are compared against the row names of installed.packages() (one row
# per installed package); comparing against the whole matrix would also match
# its version, dependency and path cells.
pkg <- c("TeachingDemos", "statnet", "stargazer", "miscTools", "stringr",
         "texreg", "ggplot2", "gridExtra", "dplyr")
inst <- pkg %in% rownames(installed.packages())
if (any(!inst)) install.packages(pkg[!inst])
rm(pkg, inst)

# Load required packages 
library("TeachingDemos")
library("statnet")
library("stargazer")
library("miscTools")
library("stringr")
library("texreg")
library("ggplot2")
library("gridExtra")
library("dplyr")



#-----------------------------------------------------------------------------#
#    Setting up code for generating log file                                  #
#-----------------------------------------------------------------------------#

# Open the replication log; the two flags request that both the commands and
# their results are written to it.
txtStart(
  file     = "network-centrality-replication.log",
  commands = TRUE,
  results  = TRUE
)



#-----------------------------------------------------------------------------#
#    Senate co-sponsorships data (Fowler 2006)                                #
#-----------------------------------------------------------------------------#

# Dyadic edge list without a header row: two name columns plus a weight
# column that the script calls "degree".
cospons <- read.csv("cosponsorships.csv", header = FALSE)
names(cospons) <- c("lastname1", "lastname2", "degree")

# Rows 1-4950 are mirrored below. Fail fast if the file is shorter: indexing
# past the last row would silently append all-NA rows to the edge list.
# NOTE(review): 4950 = choose(100, 2); presumably the number of distinct
# senator pairs -- confirm against the CSV.
stopifnot(nrow(cospons) >= 4950)

# Mirror those rows (swap the two name columns) so each of these pairs is
# present in both directions.
cospons.b <- cospons[1:4950, c("lastname2", "lastname1", "degree")]
names(cospons.b) <- c("lastname1", "lastname2", "degree")

# Stack both directions and sort by lastname1, then by lastname2 within
# lastname1. A single two-key arrange() states the intended order directly
# instead of relying on two consecutive stable sorts.
cospons2 <- rbind(cospons, cospons.b) %>%
  arrange(lastname1, lastname2)


# Making into network matrix (adjacency or sociomatrix)

# Long -> wide: one row per lastname1 and one "degree.<lastname2>" column per
# distinct lastname2 value.
nwk.cospons <- reshape(cospons2, direction = "wide", idvar = "lastname1",
                       timevar = "lastname2", v.names = "degree")

# Drop the leading id column (lastname1) and keep only the degree.* columns.
# A negative column index replaces the 1:nrow() / 2:ncol() sequences, which
# misbehave when a dimension is 0 or 1.
nwk.cospons <- as.matrix(nwk.cospons[, -1])

# A sociomatrix has to be square; stop here with a clear message rather than
# failing later when the dimnames are assigned.
stopifnot(nrow(nwk.cospons) == ncol(nwk.cospons))


# Creating centrality list and EV centrality scores

cospons.cent <- read.csv("fowler108full.csv", header = TRUE)

# Use the labels column as both the row and the column names of the matrix.
# NOTE(review): unlike the House and Mexico sections, the labels are not
# sorted here; this assumes fowler108full.csv already lists senators in the
# same order as the rows of nwk.cospons -- confirm.
dimnames(nwk.cospons) <- list(cospons.cent$labels, cospons.cent$labels)

# Centrality scores computed from the matrix, stored next to the columns read
# from the CSV (presumably sna::evcent, attached via statnet).
cospons.cent$net.ev <- evcent(nwk.cospons, ignore.eval = FALSE)




#-----------------------------------------------------------------------------#
#    House caucuses data (Victor & Ringe 2009)                                #
#-----------------------------------------------------------------------------#

# Columns 2-3 of full435.csv give the (lastname1, lastname2) pairs to keep;
# caucus.csv is a headerless edge list with a weight column named "degree".
full435 <- read.csv("full435.csv")[, 2:3]
house <- setNames(read.csv("caucus.csv", header = FALSE),
                  c("lastname1", "lastname2", "degree"))

# Keep every pair from full435 (all.x = TRUE), attach the weight where
# caucus.csv has one, and sort by lastname1, then lastname2.
house2 <- merge(full435, house,
                by = c("lastname1", "lastname2"), all.x = TRUE) %>%
  arrange(lastname1, lastname2)

# Pairs that do not appear in caucus.csv get a weight of 0.
house2$degree <- ifelse(is.na(house2$degree), 0, house2$degree)

# Making into network matrix (adjacency or sociomatrix)

# Long -> wide: one row per lastname1 and one "degree.<lastname2>" column per
# distinct lastname2 value.
nwk.caucus <- reshape(house2, direction = "wide", idvar = "lastname1",
                      timevar = "lastname2", v.names = "degree")

# Drop the leading id column (lastname1) and keep only the degree.* columns.
# A negative column index replaces the 1:nrow() / 2:ncol() sequences, which
# misbehave when a dimension is 0 or 1.
nwk.caucus <- as.matrix(nwk.caucus[, -1])

# A sociomatrix has to be square; stop here with a clear message rather than
# failing later when the dimnames are assigned.
stopifnot(nrow(nwk.caucus) == ncol(nwk.caucus))


# Creating centrality list

# Single-column list of node labels, sorted before being used as dimnames.
# NOTE(review): assumes the sorted labels correspond one-to-one to the rows
# of nwk.caucus (built from data sorted by lastname1) -- confirm.
caucus.cent <- setNames(read.csv("110housefull.csv", header = FALSE),
                        "labels") %>%
  arrange(labels)
dimnames(nwk.caucus) <- list(caucus.cent$labels, caucus.cent$labels)


# Computing centrality measures

# Weights are transformed to log(x + 1) first because of the high skew of
# the scraped data.
# NOTE: correlation between two is even higher without taking logs
nwk.caucus <- log(nwk.caucus + 1)

caucus.cent$net.ev <- evcent(nwk.caucus, ignore.eval = FALSE)


# Combining with Victor and Ringe

vicrin <- read.csv("victorringe.csv")

# Keep only labels present in both tables, then drop merged rows 11 and 15:
# Joe Wilson and McNulty (names too common).
# NOTE(review): the removal is positional, so it depends on the row order of
# the merge output -- verify if either input file changes.
caucus.val <- merge(vicrin, caucus.cent, by = "labels")[-c(11, 15), ]




#-----------------------------------------------------------------------------#
#    Mexico board data (Avina-Vazquez & Uddin 2013)                           #
#-----------------------------------------------------------------------------#

# Dyadic edge list without a header row: two name columns plus a weight
# column that the script calls "degree".
mexico <- read.csv("mexicoboard.csv", header = FALSE)
names(mexico) <- c("lastname1", "lastname2", "degree")

# Rows 1-253 are mirrored below. Fail fast if the file is shorter: indexing
# past the last row would silently append all-NA rows to the edge list.
# NOTE(review): 253 = choose(23, 2); confirm that this is the number of
# distinct pairs in mexicoboard.csv.
stopifnot(nrow(mexico) >= 253)

# Mirror those rows (swap the two name columns) so each of these pairs is
# present in both directions.
mexico.b <- mexico[1:253, c("lastname2", "lastname1", "degree")]
names(mexico.b) <- c("lastname1", "lastname2", "degree")

# Stack both directions and sort by lastname1, then by lastname2 within
# lastname1. A single two-key arrange() states the intended order directly
# instead of relying on two consecutive stable sorts.
mexico2 <- rbind(mexico, mexico.b) %>%
  arrange(lastname1, lastname2)


# Making into network matrix (adjacency or sociomatrix)

# Long -> wide: one row per lastname1 and one "degree.<lastname2>" column per
# distinct lastname2 value.
nwk.mexico <- reshape(mexico2, direction = "wide", idvar = "lastname1",
                      timevar = "lastname2", v.names = "degree")

# Drop the leading id column (lastname1) and keep only the degree.* columns.
# A negative column index replaces the 1:nrow() / 2:ncol() sequences, which
# misbehave when a dimension is 0 or 1.
nwk.mexico <- as.matrix(nwk.mexico[, -1])

# A sociomatrix has to be square; stop here with a clear message rather than
# failing later when the dimnames are assigned.
stopifnot(nrow(nwk.mexico) == ncol(nwk.mexico))


# Creating centrality list

# Single-column list of node labels, sorted before being used as dimnames.
# NOTE(review): assumes the sorted labels correspond one-to-one to the rows
# of nwk.mexico (built from data sorted by lastname1) -- confirm.
mexico.cent <- setNames(read.csv("mexico24.csv", header = FALSE),
                        "labels") %>%
  arrange(labels)
dimnames(nwk.mexico) <- list(mexico.cent$labels, mexico.cent$labels)


# Computing centrality measures

# Weights are transformed to log(x + 1) first because of the high skew of
# the scraped data.
nwk.mexico <- log(nwk.mexico + 1)
mexico.cent$net.ev <- evcent(nwk.mexico, ignore.eval = FALSE)


# Combining with Avina Vazquez

# Keep only the labels that appear in both tables.
apira <- read.csv("apira.csv")
mexico.val <- merge(apira, mexico.cent, by = "labels")




#-----------------------------------------------------------------------------#
#    Appendix Figure 4                                                        #
#-----------------------------------------------------------------------------#


# EV centrality plots for each of the three datasets 

# EV plot for Senate co-sponsorships

# Correlation between the two centrality columns, formatted as a plotmath
# string ("rho == <value>") because the annotation below is parsed.
rho1 <- cor(cospons.cent$net.ev, cospons.cent$evcent)
lbl1 <- paste("rho", " == ", round(rho1, 3))

# Base plot: net.ev (computed above) on x, the evcent column read from the
# Fowler file on y.
g1 <- ggplot(data = cospons.cent,
             mapping = aes(x = net.ev, y = evcent, label = lastname))

# Points with a linear fit, y axis restricted to [0, 0.15], the correlation
# label, and six hand-placed senator names.
p1 <- g1 +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  ylim(0, 0.15) +
  labs(
    x = "Eigenvector centrality (Google correlations)",
    y = "Eigenvector centrality (Fowler)"
  ) +
  annotate(
    geom = "text", x = 0.03, y = 0.145,
    label = lbl1, parse = TRUE, colour = "darkred"
  ) +
  annotate(
    geom = "text",
    x = c(0.157, 0.035, 0.183, 0.155, 0.09, 0.04),
    y = c(0.093, 0.13, 0.064, 0.0402, 0.0115, 0.015),
    label = c("Cochran", "Frist", "Lautenberg", "Shelby", "Carper", "Bennett")
  ) +
  theme_classic()



# EV plot for House caucus network

# Correlation between the two centrality columns, formatted as a plotmath
# string ("rho == <value>") because the annotation below is parsed.
rho2 <- with(caucus.val, cor(evcent, net.ev))
lbl2 <- paste("rho", " == ", round(rho2, 3))

# Base plot: net.ev (computed above) on x, the evcent column from the
# Victor-Ringe file on y.
g2 <- ggplot(data = caucus.val, mapping = aes(x = net.ev, y = evcent))

# Points with a linear fit, the correlation label, and four hand-placed
# member names.
p2 <- g2 +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  labs(
    x = "Eigenvector Centrality (Google correlations)",
    y = "Eigenvector Centrality (Victor-Ringe)"
  ) +
  annotate(
    geom = "text", x = 0.08, y = 15.2,
    label = lbl2, parse = TRUE, colour = "darkred"
  ) +
  annotate(
    geom = "text",
    x = c(0.4, 0.13, 0.34, 0.14),
    y = c(15.16, 14.2, 12.61, 12.44),
    label = c("Waxman", "Doggett", "Smith", "Holt")
  ) +
  theme_classic()



# EV plot for Mexico board network

# Correlation between the two centrality columns over the rows where both
# are present, formatted as a plotmath string ("rho == <value>") because the
# annotation below is parsed.
rho3 <- cor(mexico.val[, c("evcent", "net.ev")],
            use = "pairwise.complete.obs")["evcent", "net.ev"]
lbl3 <- paste("rho", " == ", round(rho3, 3))

# Base plot restricted to rows with a non-missing evcent value.
g3 <- ggplot(data = mexico.val[!is.na(mexico.val$evcent), ],
             mapping = aes(x = net.ev, y = evcent))

# Points with a linear fit, the correlation label, and four hand-placed
# director names.
p3 <- g3 +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  labs(
    x = "Eigenvector Centrality (Google correlations)",
    y = "Eigenvector Centrality (Avina-Vazquez)"
  ) +
  annotate(
    geom = "text", x = 0.08, y = 0.266,
    label = lbl3, parse = TRUE, colour = "darkred"
  ) +
  annotate(
    geom = "text",
    x = c(0.19, 0.15, 0.31, 0.33),
    y = c(0.189, 0.228, 0.213, 0.2525),
    label = c("Corral", "Perez", "Gual", "Gonzalez")
  ) +
  theme_classic()



# Figure 4: all three in one PNG file

# Lay the three panels out side by side (three columns) and keep the combined
# object so it can be written to disk.
g <- grid.arrange(grobs = list(p1, p2, p3), ncol = 3)

# 18 x 6 inch PNG at 600 dpi.
ggsave(
  filename = "AppendixFigure4.png",
  plot     = g,
  width    = 18,
  height   = 6,
  units    = "in",
  dpi      = 600
)


#-----------------------------------------------------------------------------#
#    Stop generating log file                                                 #
#-----------------------------------------------------------------------------#

# Ends the logging started by the txtStart() call near the top of the script.
# NOTE(review): this line is only reached if everything above runs without
# error -- confirm whether the log needs closing manually after a failure.
txtStop()
